BA9 R Markdown

R version-4.1.0 import datasets file> Import Dataset > From Excel > Import

finalmatrix.xls: count matrix x= erv-loci locations, y= Sample names TREM2MasterAnnotation.xlsx = load meta data

#load install packages
library(dplyr)
library(tidyverse)
library(readxl)
library(DESeq2)
library(tibble)
library(plotly)
library(ggplot2)
library(EnhancedVolcano)
library(sva)
library(DT)
#BA9

cts <- read_excel("~/Genomic Medicine/Project/msc_project/finalmatrix.xls") 
#header = TRUE , sep="\t", fill = TRUE, na.strings = "(NA)")
colnames(cts) <-sub("\\.", "-", colnames(cts))

cts <- as.data.frame(cts)
rownames(cts) <-cts$`erv-id`

#remove all row and col with NA
cts <- cts[(1:789),]

#remove MCI "RDobson-117
#remove gyrus"RDobson-129", "RDobson-130", "RDobson-131", "RDobson-133"
#remove HC "RDobson-132
#remove failed sequencing "RDobson-98"
 #64 remaining

#outliers , low quality , "RDobson-36" 'RDobson-52'

ctsBA9 <- cts %>% select('erv-id', "RDobson-97", "RDobson-22", "RDobson-94", "RDobson-42","RDobson-96", "RDobson-26", "RDobson-36", "RDobson-52", "RDobson-20", "RDobson-43", "RDobson-107", "RDobson-61", "RDobson-95", "RDobson-121", "RDobson-127", "RDobson-114", "RDobson-65", "RDobson-16", "RDobson-102", "RDobson-35", "RDobson-57", "RDobson-51", "RDobson-72", "RDobson-89", "RDobson-13", "RDobson-66", "RDobson-34", "RDobson-60", "RDobson-17", "RDobson-112", "RDobson-32", "RDobson-87", "RDobson-41", "RDobson-64", "RDobson-90", "RDobson-116", "RDobson-81", "RDobson-50", "RDobson-3",  "RDobson-73", "RDobson-77", "RDobson-12", "RDobson-39", "RDobson-110", "RDobson-37", "RDobson-99", "RDobson-59", "RDobson-55", "RDobson-23", "RDobson-124", "RDobson-11", "RDobson-86", "RDobson-108", "RDobson-25", "RDobson-83", "RDobson-24", "RDobson-105", "RDobson-103", "RDobson-75", "RDobson-68", "RDobson-123", "RDobson-79", "RDobson-71") 
#HC and EC

meta <- read_xlsx("~/Genomic Medicine/Project/TREM2MasterAnnotation.xlsx")
## New names:
## * `Subject id` -> `Subject id...1`
## * `Sample ID` -> `Sample ID...3`
## * `Subject id` -> `Subject id...16`
## * `RNA tube labels` -> `RNA tube labels...66`
## * `RNA tube labels` -> `RNA tube labels...67`
## * ...
metaBA9 <- select(meta, `Sample ID...73`, Diagnosis_1, Tissue , `Sex`, `Age (at death)`, `PostMortemDelay (hours)`, `RIN Score`, `No. of E4 alleles`, `Sequencing Pool`, `Braak and Braak stage (modified Braak Stage)`) 

#select only BA9 tissue
metaBA9<-metaBA9[metaBA9$Tissue == 'BA9',]


#correct colnames into readable format
names(metaBA9)[1] <-"Sample"
names(metaBA9)[5] <-"Age_at_death"
names(metaBA9)[6] <-"PostMortem_Delay_hours"
names(metaBA9)[7] <-"RIN_Score"
names(metaBA9)[8] <-"Num_of_E4_alleles"
names(metaBA9)[9] <-"Seq_pool"
names(metaBA9)[10] <-"Braak_Stage"

#define Braak values
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "TBC", "0")
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "UnusualNoBraak", "0")
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "modified Braak BNE II", "II")
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "HP Tau stage I", "I")

#specify bin values
metaBA9$Num_of_E4_alleles <- as.factor(metaBA9$Num_of_E4_alleles)

#change NA values to 0 
metaBA9[is.na(metaBA9)] <- 0

#place RIN value in quintiles bins 
#metaBA9 = mutate(metaBA9, quantile_rank = ntile(metaBA9$RIN_Score,5))
#metaBA9$quantile_rank<- as.factor(metaBA9$quantile_rank)

#place PostMortem_Delay_hours in quintile bins
#metaBA9 = mutate(metaBA9, quantile_rank = ntile(metaBA9$PostMortem_Delay_hours,5))
#metaBA9$quantile_rank <- as.factor(metaBA9$quantile_rank)

#place Age_at_death in quintile bins
#metaBA9 = mutate(metaBA9, quantile_rank = ntile(metaBA9$Age_at_death,5))
#metaBA9$quantile_rank <- as.factor(metaBA9$quantile_rank)


metaBA9 <- as.data.frame(metaBA9)
#rm RDobson-98, RDobson-117
metaBA9 <- metaBA9[-c(7,10),]
#8,9
#run deseq2
dds <- DESeqDataSetFromMatrix(countData = ctsBA9 ,
                              colData = metaBA9 ,
                              design =~Sex + Age_at_death + RIN_Score + PostMortem_Delay_hours + Num_of_E4_alleles + Seq_pool + Diagnosis_1 ,
                              tidy = TRUE)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
rld <- varianceStabilizingTransformation(dds)
rld_pca <- plotPCA(rld, intgroup=c("Diagnosis_1")) + 
  labs(title = "BA9- PCA plot 2 ") +
  aes(label = colnames(rld))
rld_pca_plotly <- ggplotly(rld_pca)

rld_pca_plotly 
#correct for known batch effects using SVA and ComBat
#"RDobson-36" 'RDobson-52' replaced to observe effect
batch <- metaBA9$Seq_pool
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 1", '1')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 2", '2')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 3", '3')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 4", '4')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 5", '5')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 6", '6')
metaBA9$Seq_pool <- as.numeric(metaBA9$Seq_pool)
dat <- as.matrix(ctsBA9[2:ncol(ctsBA9)])

modcombat = model.matrix(~1, data =(as.data.frame(metaBA9$Num_of_E4_alleles))) 

data_adjusted_BA9 <- ComBat_seq(dat, batch=batch, group = modcombat)
## Found 6 batches
## Using null model in ComBat-seq.
## Adjusting for 0 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data
#plot PCA
pca_adjustedBA9 <-prcomp(data_adjusted_BA9)
pca_adjustedBA9<- summary(pca_adjustedBA9)

#select pca 1 and pca 2, make matrix
PCA1 <- pca_adjustedBA9$rotation[,1:2]

#add phenotype info
PCA1 <-cbind(PCA1,metaBA9[c(1,2,4,5,6,7,8,9)])

#shows %pc1
#head(pca_adjustedBA9)

#plot
ggplotly(ggplot(data = PCA1, 
                aes(x=PC1, y=PC2, col= Diagnosis_1))+
           xlab("PC1: 0.91% variance") +
           ylab("PC2: 0.03% variance") +
           geom_point()+
           ggtitle("BA9- SVA adjusted PCA"))
data_adjusted_BA9 <- as.matrix(data_adjusted_BA9)

dds <- DESeqDataSetFromMatrix(countData = data_adjusted_BA9 ,
                       colData = metaBA9 ,
                       design =~  ~Sex + Age_at_death + RIN_Score + PostMortem_Delay_hours + Num_of_E4_alleles + Seq_pool + Diagnosis_1 )
dds <- DESeq(dds)

res <- results(dds, contrast = c("Diagnosis_1","AD", "Control"))
res$ervid <- row.names(res)
#top5BA9 <-c('3182','3597','3397','3937','3203')
BA9_volcanco <- EnhancedVolcano(res, lab= rownames(res),
                x ='log2FoldChange', y = 'padj',
                #selectLab = c(top5BA9),
                ylab = ~-Log[10]~adjusted~italic(P),
                title = "DESeq2 results BA9",
                subtitle = "Differential expression Volcano Plot",
                legendPosition = "bottom",
                xlim = c(-3,3),
                ylim = c(0,5),
                pCutoff = 0.05,
                drawConnectors = TRUE)


#look at res table as a sorted data.frame
datatable(as.data.frame(res))
#write file as csv to export to excel
#write.csv(as.data.frame(res), file="BA9_ERV_loci_results.csv")
# subset values less than the p adjsut value <0.05
resSig <- subset(res, padj < 0.05)

#view top downregulated log fold change
head(resSig[ order(resSig$log2FoldChange), ])
## log2 fold change (MLE): Diagnosis_1 AD vs Control 
## Wald test p-value: Diagnosis_1 AD vs Control 
## DataFrame with 6 rows and 7 columns
##       baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##      <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## 3597   16.8585       -2.07632  0.412420  -5.03448 4.79154e-07 0.000115716
## 3937  304.2506       -1.42632  0.292508  -4.87618 1.08158e-06 0.000130600
## 3916   21.5422       -1.19943  0.355523  -3.37371 7.41623e-04 0.011193879
## 6125   18.3642       -1.15687  0.335708  -3.44607 5.68801e-04 0.009501209
## 3775  102.9523       -1.13890  0.259761  -4.38444 1.16286e-05 0.000702078
## 3316 1324.8878       -1.11382  0.313276  -3.55538 3.77426e-04 0.008039493
##            ervid
##      <character>
## 3597        3597
## 3937        3937
## 3916        3916
## 6125        6125
## 3775        3775
## 3316        3316
#view top upregulated log fold change
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): Diagnosis_1 AD vs Control 
## Wald test p-value: Diagnosis_1 AD vs Control 
## DataFrame with 6 rows and 7 columns
##       baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##      <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## 3743  125.7519       1.851115  0.395071   4.68553 2.79241e-06 0.000224789
## 5937   16.8099       1.571864  0.458043   3.43170 5.99821e-04 0.009657118
## 3505   22.7659       1.391711  0.392027   3.55004 3.85169e-04 0.008039493
## 3397  268.6015       1.095974  0.223821   4.89665 9.74866e-07 0.000130600
## 5980   27.7196       0.968430  0.324233   2.98683 2.81882e-03 0.028194791
## 3557  258.4234       0.951309  0.292603   3.25120 1.14920e-03 0.015001745
##            ervid
##      <character>
## 3743        3743
## 5937        5937
## 3505        3505
## 3397        3397
## 5980        5980
## 3557        3557
#top ERV with pvalue
topGene <- rownames(res)[which.min(res$padj)]

#plot control against AD for ERV loci 3397
ervcount <- counts(dds[c("3397")], normalized = TRUE)
m <- list(counts = as.numeric(ervcount), group = as.factor(dds$Diagnosis_1))
m <- as.tibble(m)

box3397 <- ggplot(m, aes(group, counts)) + 
  geom_boxplot(width = 0.3, outlier.shape = NA) + 
  geom_point(aes(color=group), position=position_jitter(width=0.1, height=0.1)) + 
  labs(x = "Alzhiemer's vs Control", 
       y = "Normalized Read Counts ", 
       title = "Normalized Count Expression of ERVmap loci 3397") + 
         theme(plot.title = element_text(face="bold", size=20))

box3397